
from pyspark import *
import os

if __name__ == '__main__':

    # Demo of common RDD *action* operators: reduce, fold, count,
    # countByKey, take, top, first, takeSample.
    conf = SparkConf().setAppName("2_reduce").setMaster("local[*]")
    sc = SparkContext(conf=conf)

    # rdd1 holds 3-tuples; rdd2 holds plain ints spread over 3 partitions.
    rdd1 = sc.parallelize([("hadoop", 1, 1), ("java", 2, 2), ("python", 3, 3), ("python", 10, 10)])
    rdd2 = sc.parallelize([1, 2, 3, 4, 5], 3)

    # reduce on tuples: `+` concatenates them into one long tuple.
    rdd3 = rdd1.reduce(lambda a, b: a + b)
    print(rdd1.collect())
    print(rdd3)

    # reduce on ints: plain sum.
    rdd4 = rdd2.reduce(lambda a, b: a + b)
    print(rdd4)

    # fold: the initial value (10) is applied both within each partition
    # and when merging partition results, so it is added (numPartitions + 1) times.
    print(rdd2.fold(10, lambda a, b: a + b))

    print(rdd2.count())
    # countByKey treats the first tuple element as the key.
    print(rdd1.countByKey())

    # take: first x elements in partition order, no sorting.
    print(rdd1.take(5))
    # top: x largest elements (descending order).
    print(rdd1.top(2))
    print(rdd1.first())

    rdd22 = sc.parallelize([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11], 3)
    # takeSample collects the sample to the driver.
    # First argument: whether sampling is done with replacement.
    print(rdd22.takeSample(True, 20, 3))

    # BUG FIX: os.popen() requires a command string; calling it with no
    # arguments raises TypeError. Run a trivial command and read its output.
    print(os.popen('echo done').read())

    # Release cluster resources held by the SparkContext.
    sc.stop()



